Show the code
library(targets)
library(tidyverse)
library(ggokabeito)
library(easystats)
library(gt)
library(ggfittext)
library(scales)
Wintersemester 2023/2024
library(targets)
library(tidyverse)
library(ggokabeito)
library(easystats)
library(gt)
library(ggfittext)
library(scales)
theme_set(theme_minimal())
JSON-Daten wurden nicht importiert, da offenbar nur redundante Daten enthalten sind.
tar_load(data_all_fct)
Der Roh-Datensatz verfügt über
Jede Zeile entspricht einem “Visit”.
Entfernt man Developer, Admins und Lecturers aus dem Roh-Datensatz, so bleiben weniger Zeilen übrig:
# Load the pipeline targets used in the following sections.
tar_load(data_users_only)
tar_load(count_action)
tar_load(time_minmax)

# Collapse `time_minmax` to the overall earliest and latest timestamp and
# render the single-row result as a gt table.
time_minmax |>
  summarise(
    time_min = min(time_min),
    time_max = max(time_max)
  ) |>
  gt()

| time_min | time_max |
|---|---|
| 2023-10-04 16:19:46 | 2024-02-26 18:27:42 |
Diese Statistik wurde auf Basis des Datenobjekts data_slim berechnet.
# Descriptive statistics of `n_max` (number of actions per visit), rendered
# as a gt table with every numeric column formatted to two decimals.
count_action |>
  describe_distribution(n_max) |>
  gt() |>
  fmt_number(
    columns = where(is.numeric),
    decimals = 2
  )

| Variable | Mean | SD | IQR | Min | Max | Skewness | Kurtosis | n | n_Missing |
|---|---|---|---|---|---|---|---|---|---|
| n_max | 657.18 | 1,247.39 | 679.00 | 10.00 | 11,816.00 | 3.43 | 15.79 | 3,187.00 | 0.00 |
# Variant of `count_action` without the rows whose action count is exactly 499.
# NOTE(review): the reason for excluding n_max == 499 is not stated here; the
# printed tables show it drops a single row (n 3,187 -> 3,186) -- confirm intent.
count_action2 <-
  count_action |>
  filter(n_max != 499)

# Same descriptive table as for `count_action`, now on the filtered data.
count_action2 |>
  describe_distribution(n_max) |>
  gt() |>
  fmt_number(
    columns = where(is.numeric),
    decimals = 2
  )

| Variable | Mean | SD | IQR | Min | Max | Skewness | Kurtosis | n | n_Missing |
|---|---|---|---|---|---|---|---|---|---|
| n_max | 657.23 | 1,247.58 | 681.25 | 10.00 | 11,816.00 | 3.43 | 15.78 | 3,186.00 | 0.00 |
# Mean and SD of the action counts; used for the plot annotations below.
count_action_avg <- mean(count_action$n_max)
count_action_sd <- sd(count_action$n_max)

# Histogram of actions per visit. The vertical line marks the mean; the
# horizontal bar on the x-axis spans mean - SD to mean + SD.
count_action |>
  ggplot() +
  geom_histogram(aes(x = n_max)) +
  labs(
    x = "Anzahl von Aktionen pro Visit",
    y = "n",
    caption = "Der vertikale Strich zeigt den Mittelwert; der horizontale die SD"
  ) +
  theme_minimal() +
  geom_vline(
    xintercept = count_action_avg,
    color = palette_okabe_ito()[1]
  ) +
  geom_segment(
    x = count_action_avg - count_action_sd,
    y = 0,
    xend = count_action_avg + count_action_sd,
    yend = 0,
    color = palette_okabe_ito()[2],
    # `linewidth` replaces `size` for line geoms (ggplot2 >= 3.4.0).
    linewidth = 2
  ) +
  annotate("label", x = count_action_avg, y = 1500, label = "MW") +
  annotate("label", x = count_action_avg + count_action_sd, y = 0, label = "SD")
# geom_label(aes(x = count_action_avg), y = 1, label = "Mean")

# Same statistics and plot for the filtered data `count_action2`.
count_action_avg2 <- mean(count_action2$n_max)
count_action_sd2 <- sd(count_action2$n_max)

count_action2 |>
  ggplot() +
  geom_histogram(aes(x = n_max)) +
  labs(
    x = "Anzahl von Aktionen pro Visit",
    y = "n",
    title = "Verteilung der User-Aktionen pro Visit",
    caption = "Der vertikale Strich zeigt den Mittelwert; der horizontale die SD"
  ) +
  theme_minimal() +
  geom_vline(
    xintercept = count_action_avg2,
    color = palette_okabe_ito()[1]
  ) +
  geom_segment(
    # Both ends of the SD bar must use the statistics of `count_action2`;
    # the left end previously used the mean of the unfiltered data.
    x = count_action_avg2 - count_action_sd2,
    y = 0,
    xend = count_action_avg2 + count_action_sd2,
    yend = 0,
    color = palette_okabe_ito()[2],
    linewidth = 2
  ) +
  annotate("label", x = count_action_avg2, y = 1500, label = "MW", vjust = "top") +
  annotate("label", x = count_action_avg2 + count_action_sd2, y = 0, label = "SD", vjust = "bottom")
# geom_label(aes(x = count_action_avg), y = 1, label = "Mean")

Die Visit-Zeit wurde auf 600 Min. begrenzt.
tar_load(time_spent)

# Add the visit duration in minutes (`t_min`) and keep only visits shorter
# than 600 minutes.
# NOTE(review): `as.numeric(x, units = "mins")` presumes `time_diff` is a
# difftime -- confirm against the target definition.
time_spent <-
  time_spent |>
  mutate(t_min = as.numeric(time_diff, units = "mins")) |>
  filter(t_min < 600)

# Mean, SD, minimum and maximum of the raw `time_diff` column.
# NOTE(review): comparing this table with the minutes table further down
# (1329.38 vs. 22.16) suggests `time_diff` is measured in seconds.
time_spent |>
  summarise(
    mean_time_diff = round(mean(time_diff), 2),
    sd_time_diff = sd(time_diff),
    min_time_diff = min(time_diff),
    max_time_diff = max(time_diff)
  ) |>
  gt() |>
  fmt_number(
    columns = everything(),
    decimals = 2
  )

| mean_time_diff | sd_time_diff | min_time_diff | max_time_diff |
|---|---|---|---|
| 1329.38 | 2,212.42 | 0 | 19946 |
# Mean, SD, minimum and maximum of the visit duration in minutes (`t_min`),
# rendered as a gt table with two decimals.
time_spent |>
  summarise(
    mean_t_min = mean(t_min),
    sd_t_min = sd(t_min),
    min_t_min = min(t_min),
    max_t_min = max(t_min)
  ) |>
  gt() |>
  fmt_number(
    columns = everything(),
    decimals = 2
  )

| mean_t_min | sd_t_min | min_t_min | max_t_min |
|---|---|---|---|
| 22.16 | 36.87 | 0.00 | 332.43 |
# Histogram of the visit duration with a time-formatted x-axis.
# NOTE(review): `t_min` is a plain numeric in minutes, while scale_x_time()
# presumably formats numeric input as seconds (hh:mm:ss) -- confirm that the
# axis matches the "in Minuten" label.
time_spent |>
  ggplot(aes(x = t_min)) +
  geom_histogram() +
  scale_x_time() +
  theme_minimal() +
  labs(
    y = "n",
    x = "Verweildauer in HaNS pro Visit in Minuten"
  )

# Same histogram on a plain numeric minute axis with 5-minute bins.
time_spent |>
  ggplot(aes(x = t_min)) +
  geom_histogram(binwidth = 5) +
  theme_minimal() +
  labs(
    y = "n",
    x = "Verweildauer in Minuten",
    title = "Verweildauer in HaNS pro Visit",
    caption = "binwidth = 5 Min."
  )

# Visits lasting more than 1 and less than 120 minutes.
time_spent2 <-
  time_spent |>
  filter(t_min > 1, t_min < 120)
# Histogram of the visit duration restricted to 1-120 minutes, 10-minute bins.
time_spent2 |>
  ggplot(aes(x = t_min)) +
  geom_histogram(binwidth = 10) +
  theme_minimal() +
  labs(
    y = "n",
    x = "Verweildauer in HaNS pro Visit in Minuten",
    title = "Verweildauer begrenzt auf 1-120 Minuten",
    caption = "binwidth = 10 Min."
  )

tar_load(count_action_type)

# Frequency table of action categories, most frequent first.
count_action_type |>
  count(category, sort = TRUE) |>
  gt()

| category | n |
|---|---|
| NA | 1032265 |
| video | 138257 |
| click_slideChange | 11116 |
| visit_page | 10486 |
| login | 866 |
| click_topic | 829 |
| Search Results Count | 813 |
| in_media_search | 620 |
| Kanäle | 512 |
| Medien | 415 |
| click_channelcard | 293 |
| GESOA | 260 |
# Horizontal bar chart of the number of actions per category (linear scale).
# The x-axis shows the count `n`, so it is labelled as a count.
count_action_type |>
  count(category, sort = TRUE) |>
  ggplot(aes(y = reorder(category, n), x = n)) +
  geom_col() +
  geom_bar_text() +
  labs(
    x = "Anzahl der User-Aktionen",
    y = "Aktion",
    title = "Anzahl der User-Aktionen nach Kategorie"
  ) +
  theme_minimal() +
  scale_x_continuous(labels = scales::comma)

# Same chart on a log10 x-axis.
count_action_type |>
  count(category, sort = TRUE) |>
  ggplot(aes(y = reorder(category, n), x = n)) +
  geom_col() +
  geom_bar_text() +
  labs(
    x = "Anzahl der User-Aktionen",
    y = "Aktion",
    title = "Anzahl der User-Aktionen nach Kategorie",
    caption = "Log10-Skala"
  ) +
  theme_minimal() +
  scale_x_log10()

tar_load(time_visit_wday)

# Define a vector with the names of the days of the week
# Note: Adjust the start of the week (Sunday or Monday) as per your requirement
days_of_week <- c(
  "Monday", "Tuesday", "Wednesday", "Thursday",
  "Friday", "Saturday", "Sunday"
)

# Replace numbers with day names
# NOTE(review): indexing by `dow` presumes it is coded 1..7 with 1 = Monday --
# confirm against how `time_visit_wday` is built.
time_visit_wday$dow2 <- factor(
  days_of_week[time_visit_wday$dow],
  levels = days_of_week
)

# Share of rows per hour of day (bar chart).
time_visit_wday |>
  as_tibble() |>
  count(hour) |>
  mutate(prop = n / sum(n)) |>
  ggplot(aes(x = hour, y = prop)) +
  geom_col() +
  theme_minimal() +
  labs(
    title = "HaNS-Nutzer sind keine Frühaufsteher",
    x = "Uhrzeit",
    y = "Anteil"
  )
# coord_polar()

# Same shares per hour in polar coordinates.
time_visit_wday |>
  as_tibble() |>
  count(hour) |>
  mutate(prop = n / sum(n)) |>
  ggplot(aes(x = hour, y = prop)) +
  geom_col() +
  theme_minimal() +
  coord_polar()

# Share of rows per weekday (bar chart).
time_visit_wday |>
  as_tibble() |>
  count(dow2) |>
  mutate(prop = n / sum(n)) |>
  ggplot(aes(x = dow2, y = prop)) +
  geom_col() +
  theme_minimal() +
  labs(
    title = "Verteilung der HaNS-Logins nach Wochentagen",
    x = "Wochentag",
    y = "Anteil"
  )
# coord_polar()

# Same shares per weekday in polar coordinates.
time_visit_wday |>
  as_tibble() |>
  count(dow2) |>
  mutate(prop = n / sum(n)) |>
  ggplot(aes(x = dow2, y = prop)) +
  geom_col() +
  theme_minimal() +
  labs(
    title = "Verteilung der HaNS-Logins nach Wochentagen",
    x = "Wochentag",
    y = "Anteil"
  ) +
  coord_polar()

# Share of rows per hour within each weekday, one facet per weekday.
# The x-axis is the hour of day, so it is labelled "Uhrzeit".
time_visit_wday |>
  as_tibble() |>
  count(dow2, hour) |>
  group_by(dow2) |>
  mutate(prop = n / sum(n)) |>
  ungroup() |>
  ggplot(aes(x = hour, y = prop)) +
  geom_col() +
  facet_wrap(~ dow2) +
  theme_minimal() +
  labs(
    title = "Verteilung der HaNS-Logins nach Wochentagen und Uhrzeiten",
    x = "Uhrzeit",
    y = "Anteil"
  )
# coord_polar()

# Same faceted shares in polar coordinates.
time_visit_wday |>
  as_tibble() |>
  count(dow2, hour) |>
  group_by(dow2) |>
  mutate(prop = n / sum(n)) |>
  ungroup() |>
  ggplot(aes(x = hour, y = prop)) +
  geom_col() +
  facet_wrap(~ dow2) +
  theme_minimal() +
  labs(
    title = "Verteilung der HaNS-Logins nach Wochentagen und Uhrzeiten",
    x = "Uhrzeit",
    y = "Anteil"
  ) +
  coord_polar()

# Ungrouped copy of the visit data with a calendar-date column derived from
# `date_time`.
time2 <-
  time_visit_wday |>
  ungroup() |>
  mutate(date = as.Date(date_time))
# Heatmap of row counts by calendar day (x) and hour of day (y).
time2 |>
  ggplot(aes(x = date, y = hour)) +
  geom_bin2d(binwidth = c(1, 1)) + # (1 day, 1 hour)
  scale_x_date(date_breaks = "1 month") +
  theme(legend.position = "bottom") +
  scale_fill_viridis_c() +
  labs(caption = "Each x-bin maps to one day")

# Heatmap of row counts by week (x) and hour of day (y).
time2 |>
  ggplot(aes(x = date, y = hour)) +
  geom_bin2d(binwidth = c(7, 1)) + # (1 week, 1 hour)
  scale_x_date(date_breaks = "1 week", date_labels = "%W") +
  theme(legend.position = "bottom") +
  scale_fill_viridis_c() +
  labs(
    x = "Week number in 2023/2024",
    caption = "Each x-bin maps to one week"
  )

# Heatmap of row counts by week (x) and day of week (y).
time2 |>
  ggplot(aes(x = date, y = dow)) +
  geom_bin2d(binwidth = c(7, 1)) + # (1 week, 1 day of week)
  scale_x_date(date_breaks = "1 week", date_labels = "%W") +
  theme(legend.position = "bottom") +
  scale_fill_viridis_c() +
  labs(
    x = "Week number in 2023/2024",
    caption = "Each x-bin maps to one week",
    y = "Day of Week"
  ) +
  scale_y_continuous(breaks = 1:7)

tar_load(data_slim)

# Among non-empty "subtitle" rows, count how many values contain the string
# "click_transcript_word".
data_slim |>
  filter(type == "subtitle") |>
  filter(!is.na(value) & value != "") |>
  count(str_detect(value, "click_transcript_word"))

  str_detect(value, "click_transcript_word")      n
1 FALSE 185001
2 TRUE 2472